Ejercicio

1. Instalación librerias

Se instalan las sigueitnes librerías

# Instalar Librerias
library(knitrBootstrap) # archivo css
library(tidyverse) # manipulación de datos
library(readxl) # lectura y crga de archivos excel
library(GGally) # graficos (matriz) analisis exploratorio
library(janitor) # limpieza de datos
library(naniar) # visualizacion y tratamiento de datos perdidos
library(DataExplorer)
library(visdat) # visualizaciones de datos exploratorios preliminares 
library(devtools)
#devtools::install_github("XanderHorn/autoEDA")
library(autoEDA)
library(tidyverse)
library(FactoMineR) 
library(factoextra)

2. Carga de datos

Uso de la base Rolling Sales Data, esta base contiene información de ventas del departametno de Finanzas de Estados Unidos.

brooklyn <- read_excel("data/rollingsales_brooklyn.xls", skip = 4)
head(brooklyn,10)
## # A tibble: 10 x 21
##    BOROUGH NEIGHBORHOOD `BUILDING CLASS CATEGO~ `TAX CLASS AT PRESE~ BLOCK   LOT
##    <chr>   <chr>        <chr>                   <chr>                <dbl> <dbl>
##  1 3       BATH BEACH   01 ONE FAMILY DWELLINGS 1                     6359    70
##  2 3       BATH BEACH   01 ONE FAMILY DWELLINGS 1                     6360    56
##  3 3       BATH BEACH   01 ONE FAMILY DWELLINGS 1                     6362    23
##  4 3       BATH BEACH   01 ONE FAMILY DWELLINGS 1                     6367    24
##  5 3       BATH BEACH   01 ONE FAMILY DWELLINGS 1                     6371    60
##  6 3       BATH BEACH   01 ONE FAMILY DWELLINGS 1                     6380    73
##  7 3       BATH BEACH   01 ONE FAMILY DWELLINGS 1                     6392   115
##  8 3       BATH BEACH   01 ONE FAMILY DWELLINGS 1                     6399    13
##  9 3       BATH BEACH   01 ONE FAMILY DWELLINGS 1                     6409    40
## 10 3       BATH BEACH   01 ONE FAMILY DWELLINGS 1                     6414    38
## # ... with 15 more variables: EASE-MENT <lgl>, BUILDING CLASS AT PRESENT <chr>,
## #   ADDRESS <chr>, APARTMENT NUMBER <chr>, ZIP CODE <dbl>,
## #   RESIDENTIAL UNITS <dbl>, COMMERCIAL UNITS <dbl>, TOTAL UNITS <dbl>,
## #   LAND SQUARE FEET <dbl>, GROSS SQUARE FEET <dbl>, YEAR BUILT <dbl>,
## #   TAX CLASS AT TIME OF SALE <chr>, BUILDING CLASS AT TIME OF SALE <chr>,
## #   SALE PRICE <dbl>, SALE DATE <dttm>

3. Exploracion de datos

tab1 <- introduce(brooklyn) %>% select(-all_missing_columns, -total_missing_values,
                           -complete_rows, 
                           -memory_usage) %>% 
                    select(Filas=rows, Columnas=columns, 
                           `Columnas Caracter`=discrete_columns,
                           `Columnas Numéricas`=continuous_columns,
                           `Observaciones Disponibles`= total_observations)

head(tab1)
## # A tibble: 1 x 5
##   Filas Columnas `Columnas Caracter` `Columnas Numérica~ `Observaciones Disponi~
##   <int>    <int>               <int>               <int>                   <int>
## 1 19244       21                  10                  10                  404124

De la exploración se puede observar que hay 10 columnas con valores categóricos y numéricos respectivamente. Adicional existe una colmna con valores perdidos. Todo esto en una base de 404124 observaciones

3.1 Gráficos

tab2 <- plot_missing(brooklyn)

Se observa que la variable EASE-MENT es una columna con valores total mente faltantes, esta columna puede ser omitida del analisis.

brooklyn %>% select(where(is.numeric)) %>% ggpairs()

4. Limpieza de Datos

# Limpieza de nombres
NOM <- brooklyn %>% clean_names(case = "all_caps")

# Vistazo a la base final
NOM %>% glimpse()
## Rows: 19,244
## Columns: 21
## $ BOROUGH                        <chr> "3", "3", "3", "3", "3", "3", "3", "3",~
## $ NEIGHBORHOOD                   <chr> "BATH BEACH", "BATH BEACH", "BATH BEACH~
## $ BUILDING_CLASS_CATEGORY        <chr> "01 ONE FAMILY DWELLINGS", "01 ONE FAMI~
## $ TAX_CLASS_AT_PRESENT           <chr> "1", "1", "1", "1", "1", "1", "1", "1",~
## $ BLOCK                          <dbl> 6359, 6360, 6362, 6367, 6371, 6380, 639~
## $ LOT                            <dbl> 70, 56, 23, 24, 60, 73, 115, 13, 40, 38~
## $ EASE_MENT                      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ BUILDING_CLASS_AT_PRESENT      <chr> "S1", "A5", "A9", "A9", "A9", "S1", "A5~
## $ ADDRESS                        <chr> "8684 15TH AVENUE", "30 BAY 10TH STREET~
## $ APARTMENT_NUMBER               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ ZIP_CODE                       <dbl> 11228, 11228, 11228, 11214, 11214, 1121~
## $ RESIDENTIAL_UNITS              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ COMMERCIAL_UNITS               <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ~
## $ TOTAL_UNITS                    <dbl> 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, ~
## $ LAND_SQUARE_FEET               <dbl> 1933, 1547, 2280, 1571, 2417, 1740, 144~
## $ GROSS_SQUARE_FEET              <dbl> 4080, 1428, 1052, 1456, 2106, 2124, 128~
## $ YEAR_BUILT                     <dbl> 1930, 1930, 1901, 1935, 1930, 1960, 194~
## $ TAX_CLASS_AT_TIME_OF_SALE      <chr> "1", "1", "1", "1", "1", "1", "1", "1",~
## $ BUILDING_CLASS_AT_TIME_OF_SALE <chr> "S1", "A5", "A9", "A9", "A9", "S1", "A5~
## $ SALE_PRICE                     <dbl> 1300000, 75000, 0, 830000, 1188000, 990~
## $ SALE_DATE                      <dttm> 2020-04-28, 2020-11-30, 2020-11-04, 20~
# Descripción del tipo de variables
NOM %>% vis_dat(warn_large_data = F)

# Datos perdidos
NOM %>% vis_miss(warn_large_data = F)

# Eliminación de variables perdidas
NOM <- NOM %>% 
  select(-EASE_MENT)

5. Analisis Patrones

# Correlaciones entre variables
NOM %>% 
  select(where(is.numeric)) %>% vis_cor()

# Patrones en datos perdidos
NOM %>% gg_miss_upset()

6. Transponer Variables

# Transformación columnas a filas
NOM %>% 
  group_by(NEIGHBORHOOD) %>% 
  summarise(TOTAL_UNITS = sum(TOTAL_UNITS, na.rm = T),
            RESIDENTIAL_UNITS = sum(RESIDENTIAL_UNITS, na.rm = T),
            COMMERCIAL_UNITS = sum(COMMERCIAL_UNITS, na.rm = T)) %>% 
  head()
## # A tibble: 6 x 4
##   NEIGHBORHOOD       TOTAL_UNITS RESIDENTIAL_UNITS COMMERCIAL_UNITS
##   <chr>                    <dbl>             <dbl>            <dbl>
## 1 BATH BEACH                 529               490               39
## 2 BAY RIDGE                 1059               940              119
## 3 BEDFORD STUYVESANT        3049              2936              113
## 4 BENSONHURST               1028               976               52
## 5 BERGEN BEACH               280               262               18
## 6 BOERUM HILL                334               300               34
NOM %>% 
  group_by(NEIGHBORHOOD) %>% 
  summarise(TOTAL_UNITS = sum(TOTAL_UNITS, na.rm = T),
            RESIDENTIAL_UNITS = sum(RESIDENTIAL_UNITS, na.rm = T),
            COMMERCIAL_UNITS = sum(COMMERCIAL_UNITS, na.rm = T)) %>% 
  pivot_longer(cols = -c(NEIGHBORHOOD), names_to = "UNIT_TYPE", values_to = "NUMBER_UNITS") %>% 
  head()
## # A tibble: 6 x 3
##   NEIGHBORHOOD UNIT_TYPE         NUMBER_UNITS
##   <chr>        <chr>                    <dbl>
## 1 BATH BEACH   TOTAL_UNITS                529
## 2 BATH BEACH   RESIDENTIAL_UNITS          490
## 3 BATH BEACH   COMMERCIAL_UNITS            39
## 4 BAY RIDGE    TOTAL_UNITS               1059
## 5 BAY RIDGE    RESIDENTIAL_UNITS          940
## 6 BAY RIDGE    COMMERCIAL_UNITS           119
# Transformación filas a columnas
NOM %>% 
  group_by(NEIGHBORHOOD) %>% 
  summarise(TOTAL_UNITS = sum(TOTAL_UNITS, na.rm = T),
            RESIDENTIAL_UNITS = sum(RESIDENTIAL_UNITS, na.rm = T),
            COMMERCIAL_UNITS = sum(COMMERCIAL_UNITS, na.rm = T)) %>% 
  pivot_longer(cols = -c(NEIGHBORHOOD), names_to = "UNIT_TYPE", values_to = "NUMBER_UNITS") %>% 
  pivot_wider(names_from = c(UNIT_TYPE), values_from = c(NUMBER_UNITS), values_fill = 0) %>% 
  head()
## # A tibble: 6 x 4
##   NEIGHBORHOOD       TOTAL_UNITS RESIDENTIAL_UNITS COMMERCIAL_UNITS
##   <chr>                    <dbl>             <dbl>            <dbl>
## 1 BATH BEACH                 529               490               39
## 2 BAY RIDGE                 1059               940              119
## 3 BEDFORD STUYVESANT        3049              2936              113
## 4 BENSONHURST               1028               976               52
## 5 BERGEN BEACH               280               262               18
## 6 BOERUM HILL                334               300               34

7. Reducción de columnas

pca_res <- PCA(NOM %>% select(where(is.numeric)), scale.unit = T)

pca_res %>% fviz_screeplot()

pca_res %>% fviz_pca_biplot()

pca_res = PCA(NOM %>% 
                slice(-43798) %>% 
                select(where(is.numeric)), scale.unit = T)

pca_res %>% fviz_screeplot()

pca_res %>% fviz_pca_biplot()